# R packages
library(ggplot2)
library(openxlsx)
library(reshape2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
library(plyr)
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
# Bioconductor packages
library(limma)
library(edgeR)
library(variancePartition)
## Loading required package: BiocParallel
## 
## Attaching package: 'variancePartition'
## The following object is masked from 'package:limma':
## 
##     topTable
library(GSVA)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## 
## Attaching package: 'ComplexHeatmap'
## The following object is masked from 'package:plotly':
## 
##     add_heatmap
theme_set(theme_classic())
theme_update(axis.text=element_text(size=12),axis.title=element_text(size=14,face="bold"),strip.text=element_text(size=12),legend.text = element_text(size=12),legend.title = element_text(size=14),legend.key.size = unit(0.3,"cm"),title=element_text(size=14),line=element_line(size=0.5),rect=element_rect(size=0.5),legend.margin=margin(0,0,0,0),legend.box.margin=margin(0,0,0,0))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot PCA
format_pca = function(pca,metadata,merge_name){
  pca_coords = as.data.frame(pca$x)
  pca_coords[[merge_name]] = rownames(pca_coords)
  pca_coords = merge(pca_coords,metadata,all.x=TRUE)
  
  lab_pc = list()
  lab_pc = lapply(seq(1,length(pca$sdev)),function(x){
    lab_pc[[x]] = paste("PC",x,": ",round(pca$sdev[x]^2/sum(pca$sdev^2)*100,1),"%",sep="")
  })
  
  pca_loadings = as.data.frame(pca$rotation)
  pca_loadings$Element = rownames(pca_loadings)
  
  pca_list = list(pca_coords,lab_pc,pca_loadings)
  names(pca_list) = c("coords","labels","loadings")
  
  return(pca_list)
}

Load metadata

George et al (mostly primary tumor)

From George et al, Supplementary Table 1 (https://www.nature.com/articles/nature14664)

  • RNA-seq (81 samples)
  • WGS (110 samples)
metadata_george = read.xlsx("metadata/41586_2015_BFnature14664_MOESM72_ESM.xlsx",sheet="Suppl_Table1",startRow = 4,sep.names=" ")

metadata_george_wxs = metadata_george[which(metadata_george$`Whole genome sequencing` != "no"),]
metadata_george_wxs$`Sample-ID` = gsub(" ","",metadata_george_wxs$`Sample-ID`)

metadata_george = metadata_george[which(metadata_george$`RNA-seq` != "no"),]
metadata_george$`Sample-ID` = gsub(" ","",metadata_george$`Sample-ID`)
george_purity = read.xlsx("metadata/41586_2015_BFnature14664_MOESM72_ESM.xlsx",sheet="Suppl_Table2",startRow = 5,sep.names=" ")
george_purity = george_purity[which(george_purity$`sequencing method` %in% c("WGS","WGS (T), WES (N)")),]

Add placeholder for missing metadata

metadata_george[which(is.na(metadata_george$ethnicity)),]$ethnicity = "Unknown"

Lissa et al (mostly relapse/metastasis)

From:

  1. Lissa et al, Supplementary Tables 2 and Table 3, plus additional clinical data
  2. dbGaP (https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=phs002541&o=acc_s%3Aa)
  3. Oncogenomics portal
# RNAseq sample links
metadata_lissa = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Sample links",sep.names=" ")
colnames(metadata_lissa) = gsub("sample_id","NCI_sample_ID.RNA",gsub("Submitted to ","",colnames(metadata_lissa)))

# Supplementary Table 2
t2 = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Supp Table 2",startRow = 3,sep.names=" ")[1:66,]
t2 = t2[,c("Sample id","Patient id","Biopsy site","Biopsy type")]

metadata_lissa = merge(metadata_lissa,t2,
                       by.x=c("dbGap sample ID","dbGap patient ID"),
                       by.y=c("Sample id","Patient id"),all=TRUE)

# Supplementary Table 3
t3 = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="Supp Table 3",startRow = 3,sep.names=" ")[1:100,]

metadata_lissa = merge(metadata_lissa,t3,
                       by.x=c("dbGap sample ID","dbGap patient ID"),
                       by.y=c("Sample id","Patient id"),all=TRUE)

# Additional clinical metadata
clinical = read.xlsx("metadata/Clinical characteritics.xlsx",startRow = 3)
clinical = clinical[,c("sample_id","Cohort","Race(s)")]

metadata_lissa = merge(metadata_lissa,clinical,
                       by.x="NCI_sample_ID.RNA",
                       by.y="sample_id",all.x=TRUE)

rm(list=c("t2","t3","clinical"))

RNA (100 samples)

# dbGaP
sra = read.xlsx("metadata/SCLC Lissa metadata.xlsx", sheet="SraRunTable",sep.names=" ")
sra = sra[which(sra$`Assay Type` == "RNA-Seq"),]
sra = sra[,c("Sample Name","submitted_subject_id","body_site","sex")]
sra$`Sample Name` = gsub("_RNA","",sra$`Sample Name`)

metadata_lissa = merge(metadata_lissa, sra, 
                       by.x=c("dbGap sample ID","dbGap patient ID"),
                       by.y=c("Sample Name","submitted_subject_id"))

# Clinomics portal
clinomics = read.xlsx("metadata/SCLC Lissa Clinomics portal.xlsx",sep.names=" ")
clinomics = clinomics[,c("Sample Name","Patient ID","Library Type","Diagnosis","FFPE or Fresh Frozen","RIN","DV200")]

metadata_lissa = merge(metadata_lissa,clinomics,
                       by.x=c("NCI_sample_ID.RNA","patient_id"),
                       by.y=c("Sample Name","Patient ID"),all=TRUE)

rm(list=c("sra","clinomics"))

WXS (34 tumor/normal pairs)

# Nobu WXS links
wxs_links = read.xlsx("metadata/Lissa_et_al_Nat_Commun_WES_sample_matching_NT20240111.xlsx")

# Clinomics portal
clinomics = read.xlsx("metadata/SCLC Lissa Clinomics portal WXS.xlsx",sep.names=" ")
clinomics = clinomics[,c("Sample Name","Patient ID","Library Type","Diagnosis","FFPE or Fresh Frozen",
                         "Tumor|Normal","Normal sample","Rnaseq sample")]

clinomics = merge(clinomics,wxs_links[,c("dbGaP_sample_ID","NCI_sample_ID")],
            by.x="Sample Name",
            by.y="NCI_sample_ID",all.y=TRUE)

# Merge tumor/normal pairs
clinomics = cbind(clinomics,colsplit(clinomics$dbGaP_sample_ID,"_",c("dbGaP sample ID","Material")))
tumor = clinomics[which(clinomics$Material == "DNA"),]
normal = clinomics[which(clinomics$Material == "PBMC"),]
clinomics = merge(tumor[,c("dbGaP sample ID","Sample Name")],
            normal[,c("dbGaP sample ID","Sample Name","Patient ID","Library Type","Diagnosis")],
            by=c("dbGaP sample ID"),all=TRUE)
colnames(clinomics) = mapvalues(colnames(clinomics),
                                c("Sample Name.x","Sample Name.y","Library Type","Diagnosis"),
                                c("NCI_sample_ID.tumor","NCI_sample_ID.normal","Library Type.WXS","Diagnosis.WXS"))

# Combine
metadata_lissa = merge(metadata_lissa, clinomics, 
                       by.x=c("dbGap sample ID","patient_id"),
                       by.y=c("dbGaP sample ID","Patient ID"),all=TRUE)

rm(list=c("wxs_links","clinomics","tumor","normal"))

Add placeholders for missing metadata; storage method: “FFPE” and library method: “RNA access” where unspecified

metadata_lissa[which(is.na(metadata_lissa$sex)),]$sex = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Race(s)`)),]$`Race(s)` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Biopsy timepoint`)),]$`Biopsy timepoint` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`Biopsy type`)),]$`Biopsy type` = "Unknown"
metadata_lissa[which(is.na(metadata_lissa$`FFPE or Fresh Frozen`)),]$`FFPE or Fresh Frozen` = "FFPE"
metadata_lissa[which(is.na(metadata_lissa$`Library Type`)),]$`Library Type` = "access"

Load gene expression

Raw counts and TPM generated using CCBR Pipeliner (RENEE) from:

  • George: fastq in /data/SCLCgenomics/globus_data/George_data/EGAD00001001244_RNAseq/fastq (total = 57 samples, excluding two samples with unusual read structure)
  • Lissa: fastq in /data/SCLCgenomics/SCLC_integrated_analysis/data/rna/thomas (total = 100 samples)

George

Load raw counts; remove “T” suffix to match metadata

george.raw = read.table("rnaseq/George/analysis/DEG_ALL/RSEM.genes.expected_count.all_samples.txt",sep='\t',header=TRUE)

rownames(george.raw) = george.raw$gene_id
george.raw = george.raw[,3:59]

colnames(george.raw) = gsub("T$","",colnames(george.raw))

Filter metadata to samples with fastq; specify library method: “polyA” for remaining samples (all from George et al original cohort)

metadata_george_filter = metadata_george[which(metadata_george$`Sample-ID` %in% colnames(george.raw)),]

metadata_george_filter$`Library Type` = "PolyA"

Load TPM, convert to log2

george.tpm = read.table("rnaseq/George/analysis/DEG_ALL/RSEM.genes.TPM.all_samples.txt",sep='\t',header=TRUE)

rownames(george.tpm) = george.tpm$gene_id
george.tpm = george.tpm[,3:59]

colnames(george.tpm) = gsub("T$","",colnames(george.tpm))

george.tpm = log2(george.tpm + 1)

Lissa

Load raw counts

lissa.raw = read.table("rnaseq/Lissa/analysis100/DEG_ALL/RSEM.genes.expected_count.all_samples.txt",sep='\t',header=TRUE)

genes = lissa.raw[,1:2]

rownames(lissa.raw) = lissa.raw$gene_id
lissa.raw = lissa.raw[,metadata_lissa$NCI_sample_ID.RNA]

Load TPM, convert to log2

lissa.tpm = read.table("rnaseq/Lissa/analysis100/DEG_ALL/RSEM.genes.TPM.all_samples.txt",sep='\t',header=TRUE)

rownames(lissa.tpm) = lissa.tpm$gene_id
lissa.tpm = lissa.tpm[,metadata_lissa$NCI_sample_ID.RNA]

lissa.tpm = log2(lissa.tpm + 1)

Combine raw counts

combine.raw = cbind(george.raw,lissa.raw)

metadata_combined = data.frame(Sample = c(metadata_george_filter$`Sample-ID`,metadata_lissa$NCI_sample_ID.RNA),
                               Cohort = c(rep("George",57),rep("Lissa",100)),
                               Timepoint = c(metadata_george_filter$`previous therapeutic treatment for SCLC`,metadata_lissa$`Biopsy timepoint`),
                               Storage = c(metadata_george_filter$Tumor,metadata_lissa$`FFPE or Fresh Frozen`),
                               Library = c(metadata_george_filter$`Library Type`,metadata_lissa$`Library Type`),
                               Ethnicity = c(metadata_george_filter$ethnicity,metadata_lissa$`Race(s)`))

Count normalization and voom

From raw counts, filter lowly expressed genes, perform TMM normalization and voom

George

# Create DGEList
george.dge = DGEList(counts=george.raw)

# Filter lowly expressed genes
design <- model.matrix( ~ `primary tumor/metastasis` + `previous therapeutic treatment for SCLC`, metadata_george_filter)
george.dge <- george.dge[filterByExpr(george.dge,design),, keep.lib.sizes=FALSE]

# TMM normalization
george.dge = calcNormFactors(george.dge,method="TMM")

# Voom normalization
george.voom <- voom(george.dge,design)
george.voom = george.voom$E

Lissa

# Create DGEList
lissa.dge = DGEList(counts=lissa.raw)

# Filter lowly expressed genes
design <- model.matrix( ~ Disease + `Biopsy timepoint`, metadata_lissa)
lissa.dge <- lissa.dge[filterByExpr(lissa.dge,design),, keep.lib.sizes=FALSE]

# TMM normalization
lissa.dge = calcNormFactors(lissa.dge,method="TMM")

# Voom normalization
lissa.voom <- voom(lissa.dge,design)
lissa.voom = lissa.voom$E

Combined

# Create DGEList
combine.dge = DGEList(counts=combine.raw)

# Filter lowly expressed genes
design <- model.matrix( ~ Cohort, metadata_combined)
combine.dge <- combine.dge[filterByExpr(combine.dge,design),, keep.lib.sizes=FALSE]

# TMM normalization
combine.dge = calcNormFactors(combine.dge,method="TMM")

# Voom normalization
combine.voom <- voom(combine.dge,design)
combine.voom = combine.voom$E

rm(list=c("george.raw","lissa.raw"))

Sources of variation

Identify major sources of variation in the combined and separate cohorts to determine whether batch correction should be performed.

Combined cohort

PCA

On 3k most variable genes

combine.voom.3k = combine.voom[order(apply(combine.voom,1,mad),decreasing = TRUE),][1:3000,]
combine.voom.pca = prcomp(t(combine.voom.3k),scale=TRUE,center=TRUE)
combine.voom.pca = format_pca(combine.voom.pca,metadata_combined,"Sample")

# 2D
ggplot(combine.voom.pca$coords,aes(x=PC1,y=PC2,color=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(combine.voom.pca$labels[[1]]) + ylab(combine.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George/Lissa, by cohort")

# 2D
ggplot(combine.voom.pca$coords,aes(x=PC1,y=PC2,color=Library,shape=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(combine.voom.pca$labels[[1]]) + ylab(combine.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George/Lissa, by library method / cohort")

# 3D
fig <- plot_ly(combine.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~Library)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "George/Lissa, by library method",
                    scene = list(xaxis = list(title = 'PC1'),
                    yaxis = list(title = 'PC2'),
                    zaxis = list(title = 'PC3')))
fig

Conclusion: The George et al and Lissa et al cohorts separate along PC1, driven primarily by library method. Combining the two cohorts will be difficult, as several technical variables are co-linear with the cohort (e.g., untreated primary tumor vs relapse/metastasis).

George

Variable co-linearity

Summary of variables and variable correlation

apply(metadata_george_filter[,c("sex", "age", "ethnicity", "Pathology review 2", "primary tumor/metastasis","previous therapeutic treatment for SCLC", "stage_UICC", "tissue sampling","Tumor")],2,function(x) table(x,useNA="ifany"))
## $sex
## x
## female   male 
##     15     42 
## 
## $age
## x
## 48 51 52 53 54 57 58 59 61 62 63 64 65 66 67 68 70 71 73 74 75 76 77 81 82 83 
##  1  1  1  2  1  2  2  2  5  1  7  5  3  1  2  5  4  1  1  2  1  2  1  1  1  2 
## 
## $ethnicity
## x
##     Asian Caucasian   Unknown 
##         1        27        29 
## 
## $`Pathology review 2`
## x
## SCLC 
##   57 
## 
## $`primary tumor/metastasis`
## x
## metastasis (pleura)             primary 
##                   1                  56 
## 
## $`previous therapeutic treatment for SCLC`
## x
##   relapse untreated 
##         1        56 
## 
## $stage_UICC
## x
##    I   Ia   Ib   IB  IIa  IIb IIIa IIIb   IV 
##    1   15    8    1    5    3   14    3    7 
## 
## $`tissue sampling`
## x
##             biopsy   pleural effusion surgical resection 
##                  1                  1                 55 
## 
## $Tumor
## x
## FF tissue  
##         57
table(metadata_george_filter$sex,metadata_george_filter$ethnicity)
##         
##          Asian Caucasian Unknown
##   female     1         3      11
##   male       0        24      18

Variance partitioning

Determine % of variance attributable to each variable by gene (for 3k most variable genes)

george.voom.3k = george.voom[order(apply(george.voom,1,mad),decreasing = TRUE),][1:3000,]
form <- ~ `primary tumor/metastasis` + `previous therapeutic treatment for SCLC`

varPart <- fitExtractVarPartModel(george.voom.3k,form,metadata_george_filter)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data).  Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance

ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("George")

PCA

george.voom.pca = prcomp(t(george.voom.3k),scale=TRUE,center=TRUE)
george.voom.pca = format_pca(george.voom.pca,metadata_george_filter,"Sample-ID")

# 2D
ggplot(george.voom.pca$coords,aes(x=PC1,y=PC2,color=`primary tumor/metastasis`)) + geom_point(size=2) + coord_fixed() + xlab(george.voom.pca$labels[[1]]) + ylab(george.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("George, by primary tumor vs. metastasis")

# 3D
fig <- plot_ly(george.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~stage_UICC)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "George, by stage UICC",
                    scene = list(xaxis = list(title = 'PC1'),
                    yaxis = list(title = 'PC2'),
                    zaxis = list(title = 'PC3')))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Conclusion: The George et al cohort is relatively standardized, and little variation is attributable to the few metastasis / relapse samples.

Lissa

Variable co-linearity

Summary of variables and variable correlation

apply(metadata_lissa[,c("patient_id", "Cohort", "Race(s)", "sex", "Disease", "Age at diagnosis", "SCLC staging/initial diagnosis","body_site", "Biopsy type", "Biopsy timepoint","Library Type", "FFPE or Fresh Frozen")],2,function(x) table(x,useNA="ifany"))
## $patient_id
## x
## 4525930 7857901 7877742 7925529 7926637 7944275 7953756 7954499 7954517 7959576 
##       1       2       1       1       2       2       1       1       1       2 
## 7973317 8022288  CL0083  CL0106  CL0107  CL0108  CL0109  CL0110  CL0111  CL0114 
##       1       1       1       2       1       2       1       1       1       1 
##  CL0116  CL0124  CL0125  CL0126  CL0146  CL0147  CL0148  CL0152  CL0157  CL0164 
##       3       2       2       2       1       3       1       1       1       1 
##  CL0169  CL0170  CL0172  CL0173  CL0174  CL0176  CL0180  CL0186  CL0191  CL0193 
##       1       1       1       1       1       1       2       2       1       1 
##  CL0196  CL0214  CL0230  CL0261  CL0267 NCI0402 NCI0422 ucmc_11 ucmc_12 ucmc_13 
##       1       1       2       1       1       2       3       1       2       1 
## ucmc_14 ucmc_15 ucmc_16 ucmc_17 ucmc_18 ucmc_19  ucmc_2 ucmc_21 ucmc_23 ucmc_24 
##       1       3       2       1       2       1       1       1       2       1 
## ucmc_25 ucmc_26  ucmc_3 ucmc_30 ucmc_31 ucmc_33 ucmc_35  ucmc_5  ucmc_6  ucmc_7 
##       2       1       1       1       2       1       1       2       1       1 
##  ucmc_8  ucmc_9 
##       1       1 
## 
## $Cohort
## x
##       NCI Rochester 
##        66        34 
## 
## $`Race(s)`
## x
##                     Asian Black or African American                   Unknown 
##                         1                         9                         2 
##                     White 
##                        88 
## 
## $sex
## x
##  female    male Unknown 
##      54      44       2 
## 
## $Disease
## x
##       EP_bladder        EP_cervix EP_EGFRmt_trSCLC         EP_ovary 
##                1                3                4                1 
##      EP_prostate        EP_rectum    EP_subglottis             SCLC 
##                1                1                1               88 
## 
## $`Age at diagnosis`
## x
## 29 37 39 41 50 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 
##  2  2  2  1  2  2  1  7  1  3  5  3  8  4  4  5  6  3  4  3  1  6  4  1  7  1 
## 74 75 77 84 86 
##  1  8  1  1  1 
## 
## $`SCLC staging/initial diagnosis`
## x
## Extensive stage   Limited stage 
##              74              26 
## 
## $body_site
## x
##              Adrenal gland        Axillary lymph node 
##                          5                          1 
##                    Bladder                       Bone 
##                          1                          2 
##                      Brain        Inguinal lymph node 
##                          2                          2 
##                      Liver                       Lung 
##                         29                         22 
##     Mediastinal lymph node                Pelvic mass 
##                         17                          2 
##              Pleural fluid                   Prostate 
##                          4                          1 
##                 Subglottis Supraclavicular lymph node 
##                          1                         10 
##                    Uterine 
##                          1 
## 
## $`Biopsy type`
## x
## Core biopsy    Excision        FNAC     Unknown 
##          51          11           4          34 
## 
## $`Biopsy timepoint`
## x
## Initial diagnosis           Relapse           Unknown 
##                15                84                 1 
## 
## $`Library Type`
## x
##         access polya_stranded       SmartRNA 
##             95              3              2 
## 
## $`FFPE or Fresh Frozen`
## x
##         FFPE Fresh Frozen       frozen 
##           89            1           10
canCorPairs( ~ Cohort + `Race(s)` + sex + Disease + `SCLC staging/initial diagnosis` + body_site + `Biopsy type` + `Biopsy timepoint` + `Library Type` + `FFPE or Fresh Frozen`, metadata_lissa)
## Warning in canCorPairs(~Cohort + `Race(s)` + sex + Disease + `SCLC staging/initial diagnosis` + : Regression model may be problematic.
## High colinearity between variables:
##   Cohort and `Biopsy type`
##                                     Cohort  `Race(s)`        sex    Disease
## Cohort                           1.0000000 0.22257979 0.21098641 0.26504327
## `Race(s)`                        0.2225798 1.00000000 0.72677313 0.33446491
## sex                              0.2109864 0.72677313 1.00000000 0.24274955
## Disease                          0.2650433 0.33446491 0.24274955 1.00000000
## `SCLC staging/initial diagnosis` 0.3291866 0.16404733 0.09023723 0.36652389
## body_site                        0.6260669 0.40709374 0.43934994 0.80632823
## `Biopsy type`                    1.0000000 0.28842033 0.15818943 0.43279239
## `Biopsy timepoint`               0.3273268 0.07194976 0.13117847 0.25981803
## `Library Type`                   0.1646610 0.41252863 0.10007676 0.24154358
## `FFPE or Fresh Frozen`           0.2523300 0.31285404 0.10189629 0.09179851
##                                  `SCLC staging/initial diagnosis` body_site
## Cohort                                                 0.32918659 0.6260669
## `Race(s)`                                              0.16404733 0.4070937
## sex                                                    0.09023723 0.4393499
## Disease                                                0.36652389 0.8063282
## `SCLC staging/initial diagnosis`                       1.00000000 0.5723118
## body_site                                              0.57231178 1.0000000
## `Biopsy type`                                          0.37660669 0.5880126
## `Biopsy timepoint`                                     0.20445731 0.3996479
## `Library Type`                                         0.18263411 0.4005026
## `FFPE or Fresh Frozen`                                 0.17335539 0.3451468
##                                  `Biopsy type` `Biopsy timepoint`
## Cohort                               1.0000000         0.32732684
## `Race(s)`                            0.2884203         0.07194976
## sex                                  0.1581894         0.13117847
## Disease                              0.4327924         0.25981803
## `SCLC staging/initial diagnosis`     0.3766067         0.20445731
## body_site                            0.5880126         0.39964789
## `Biopsy type`                        1.0000000         0.32027102
## `Biopsy timepoint`                   0.3202710         1.00000000
## `Library Type`                       0.1799431         0.11197140
## `FFPE or Fresh Frozen`               0.2085420         0.05258182
##                                  `Library Type` `FFPE or Fresh Frozen`
## Cohort                                0.1646610             0.25232997
## `Race(s)`                             0.4125286             0.31285404
## sex                                   0.1000768             0.10189629
## Disease                               0.2415436             0.09179851
## `SCLC staging/initial diagnosis`      0.1826341             0.17335539
## body_site                             0.4005026             0.34514679
## `Biopsy type`                         0.1799431             0.20854198
## `Biopsy timepoint`                    0.1119714             0.05258182
## `Library Type`                        1.0000000             0.47524569
## `FFPE or Fresh Frozen`                0.4752457             1.00000000
table(metadata_lissa$Cohort,metadata_lissa$`Biopsy type`)
##            
##             Core biopsy Excision FNAC Unknown
##   NCI                51       11    4       0
##   Rochester           0        0    0      34

Variance partitioning

Determine % of variance attributable to each variable by gene (for 3k most variable genes)

lissa.voom.3k = lissa.voom[order(apply(lissa.voom,1,mad),decreasing = TRUE),][1:3000,]
form <- ~ Cohort + Disease + `Biopsy timepoint` + `Library Type` + `FFPE or Fresh Frozen`

varPart <- fitExtractVarPartModel(lissa.voom.3k,form,metadata_lissa)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data).  Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance

ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("Lissa")

form <- ~ patient_id + `FFPE or Fresh Frozen`

varPart <- fitExtractVarPartModel(lissa.voom.3k,form,metadata_lissa)
## Warning in filterInputData(exprObj, formula, data, useWeights = useWeights): Sample names of responses (i.e. columns of exprObj) do not match
## sample names of metadata (i.e. rows of data).  Recommend consistent
## names so downstream results are labeled consistently.
vp = melt(varPart,variable.name="Variable",value.name="Variance")
## No id variables; using all as measure variables
vp$Variance = 100*vp$Variance

ggplot(vp,aes(x=reorder(Variable,Variance,median),y=Variance)) + geom_boxplot(outlier.size=0.5,size=0.5) + ylab("Variance (%)") + xlab("Source of variation") + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=1)) + ggtitle("Lissa")

PCA

lissa.voom.pca = prcomp(t(lissa.voom.3k),scale=TRUE,center=TRUE)
lissa.voom.pca = format_pca(lissa.voom.pca,metadata_lissa,"NCI_sample_ID.RNA")

# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`Biopsy timepoint`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by timepoint")

# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=Cohort)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by cohort")

# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`Library Type`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by library method")

# 2D
ggplot(lissa.voom.pca$coords,aes(x=PC1,y=PC2,color=`FFPE or Fresh Frozen`)) + geom_point(size=2) + coord_fixed() + xlab(lissa.voom.pca$labels[[1]]) + ylab(lissa.voom.pca$labels[[2]]) + theme(aspect.ratio=1) + ggtitle("Lissa, by storage method") 

# 3D
fig <- plot_ly(lissa.voom.pca$coords, x = ~`PC1`, y = ~`PC2`, z = ~`PC3`,color = ~body_site)
fig <- fig %>% add_markers()
fig <- fig %>% layout(title = "Lissa, by body site",
                    scene = list(xaxis = list(title = 'PC1'),
                    yaxis = list(title = 'PC2'),
                    zaxis = list(title = 'PC3')))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Conclusion: The Lissa et al cohort exhibits greater technical and biological variability. Technical variables such as site, storage, or library method have a similar contribution to variance as disease. However, variation due to storage method is small compared to that by patient. The sampled body site separates samples along PC1 (primarily liver vs. lung/lymph node).

CENPA expression

CENPA expression

Extract CENPA expression (log2TPM) and split samples into CENPA-high/low based on median expression

George

metadata_george_filter$CENPA.tpm = unlist(george.tpm["ENSG00000115163.15",metadata_george_filter$`Sample-ID`])

summary(metadata_george_filter$CENPA.tpm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7137  4.6159  5.0921  4.9319  5.4754  6.5283
metadata_george_filter$CENPA.category = ifelse(metadata_george_filter$CENPA.tpm > median(metadata_george_filter$CENPA.tpm),"High","Low")

Lissa

metadata_lissa$CENPA.tpm = unlist(lissa.tpm["ENSG00000115163.15",metadata_lissa$NCI_sample_ID.RNA])

summary(metadata_lissa$CENPA.tpm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.008   2.974   2.780   3.570   4.980
metadata_lissa$CENPA.category = ifelse(metadata_lissa$CENPA.tpm > median(metadata_lissa$CENPA.tpm),"High","Low")

CES score (31-gene)

ces = c("CENPA","HJURP","MIS18BP1","MIS18A","OIP5","CENPC","CENPN","CENPI","CENPH","CENPT","CENPW","CENPS","CENPX","CENPM","CENPU","CENPL","CENPK","CENPO","CENPP","CENPQ","ITGB3BP","KNL1","ZWINT","MIS12","PMF1","NSL1","DSN1","NDC80","SPC24","SPC25","NUF2")

Calculate CES score as the sum of the log2(TPM) per gene and split samples into CES-high/low based on median expression

George

metadata_george_filter$CES.tpm = colSums(george.tpm[genes[match(ces,genes$GeneName),]$gene_id,metadata_george_filter$`Sample-ID`])

metadata_george_filter$CES.category = ifelse(metadata_george_filter$CES.tpm > median(metadata_george_filter$CES.tpm),"High","Low")

Lissa

metadata_lissa$CES.tpm = colSums(lissa.tpm[genes[match(ces,genes$GeneName),]$gene_id,metadata_lissa$NCI_sample_ID.RNA])

metadata_lissa$CES.category = ifelse(metadata_lissa$CES.tpm > median(metadata_lissa$CES.tpm),"High","Low")

Compare CENPA and CES

# Score vs score
ggplot(metadata_george_filter,aes(x=CENPA.tpm,y=CES.tpm)) + geom_point(aes(color=CENPA.category,shape=CES.category)) + 
  geom_vline(xintercept=median(metadata_george_filter$CENPA.tpm),linetype="dashed") +
  geom_hline(yintercept=median(metadata_george_filter$CES.tpm),linetype="dashed") + 
  ggtitle("George, CENPA vs. CES") + xlab("CENPA expression (log2TPM)") + ylab("CES (log2TPM)") +
  scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="CES category") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$CES.tpm,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_george_filter$CENPA.tpm and metadata_george_filter$CES.tpm
## S = 7908, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7437127
# Category vs category
table(metadata_george_filter$CENPA.category,metadata_george_filter$CES.category)
##       
##        High Low
##   High   24   4
##   Low     4  25
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$CES.category)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_george_filter$CENPA.category and metadata_george_filter$CES.category
## X-squared = 26.677, df = 1, p-value = 2.405e-07
# Score vs score
ggplot(metadata_lissa,aes(x=CENPA.tpm,y=CES.tpm)) + geom_point(aes(color=CENPA.category,shape=CES.category)) +
  geom_vline(xintercept=median(metadata_lissa$CENPA.tpm),linetype="dashed") +
  geom_hline(yintercept=median(metadata_lissa$CES.tpm),linetype="dashed") + 
  ggtitle("Lissa, CENPA vs. CES") + xlab("CENPA expression (log2TPM)") + ylab("CES (log2TPM)") +
  scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="CES category") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_lissa$CENPA.tpm,metadata_lissa$CES.tpm,method="spearman")
## Warning in cor.test.default(metadata_lissa$CENPA.tpm, metadata_lissa$CES.tpm, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_lissa$CENPA.tpm and metadata_lissa$CES.tpm
## S = 23002, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8619768
# Category vs category
table(metadata_lissa$CENPA.category,metadata_lissa$CES.category)
##       
##        High Low
##   High   40  10
##   Low    10  40
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$CES.category)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_lissa$CENPA.category and metadata_lissa$CES.category
## X-squared = 33.64, df = 1, p-value = 6.631e-09

Conclusion: There is a strong correlation between CENP-A expression and the CES score in both cohorts.

CES vs metadata

Compare CES score to technical and clinical metadata for each cohort.

George

apply(metadata_george_filter[,c("sex","primary tumor/metastasis","previous therapeutic treatment for SCLC","chemotherapy NEO-ADJUVANT (yes/no)","chemotherapy (yes/no)")],2,function(x) wilcox.test(metadata_george_filter$CES.tpm~x))
## $sex
## 
##  Wilcoxon rank sum exact test
## 
## data:  metadata_george_filter$CES.tpm by x
## W = 345, p-value = 0.5966
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`primary tumor/metastasis`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_george_filter$CES.tpm by x
## W = 17, p-value = 0.5233
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`previous therapeutic treatment for SCLC`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_george_filter$CES.tpm by x
## W = 37, p-value = 0.6054
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`chemotherapy NEO-ADJUVANT (yes/no)`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_george_filter$CES.tpm by x
## W = 41, p-value = 0.1863
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`chemotherapy (yes/no)`
## 
##  Wilcoxon rank sum exact test
## 
## data:  metadata_george_filter$CES.tpm by x
## W = 198, p-value = 0.7118
## alternative hypothesis: true location shift is not equal to 0
apply(metadata_george_filter[,c("ethnicity","stage_UICC","smoking_status","tissue sampling","radiation (yes/no)")],2,function(x) kruskal.test(metadata_george_filter$CES.tpm,x))
## $ethnicity
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 0.86092, df = 2, p-value = 0.6502
## 
## 
## $stage_UICC
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 9.2312, df = 8, p-value = 0.3232
## 
## 
## $smoking_status
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 2.856, df = 5, p-value = 0.7222
## 
## 
## $`tissue sampling`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 0.73348, df = 2, p-value = 0.693
## 
## 
## $`radiation (yes/no)`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_george_filter$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.5903, df = 2, p-value = 0.1661
cor.test(metadata_george_filter$CES.tpm,as.numeric(metadata_george_filter$age))
## 
##  Pearson's product-moment correlation
## 
## data:  metadata_george_filter$CES.tpm and as.numeric(metadata_george_filter$age)
## t = -0.87346, df = 55, p-value = 0.3862
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3663703  0.1481129
## sample estimates:
##       cor 
## -0.116969

Conclusion: No metadata is correlated with CES score in the George et al primary tumor cohort.

Lissa

apply(metadata_lissa[,c("Cohort","Trial_ATRi","Trial_IC+ PARPi","Trial_IC","SCLC staging/initial diagnosis","Smoking history","Platinum sensitivity","IO prior to biopsy")],2,function(x) wilcox.test(metadata_lissa$CES.tpm~x))
## $Cohort
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 1163, p-value = 0.7682
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $Trial_ATRi
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 1056, p-value = 0.4928
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`Trial_IC+ PARPi`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 986, p-value = 0.6329
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $Trial_IC
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 1163, p-value = 0.7682
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`SCLC staging/initial diagnosis`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 766, p-value = 0.1245
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`Smoking history`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 229, p-value = 0.01477
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`Platinum sensitivity`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 1287, p-value = 0.3877
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## $`IO prior to biopsy`
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by x
## W = 796, p-value = 0.4089
## alternative hypothesis: true location shift is not equal to 0
apply(metadata_lissa[,c("body_site","sex","Biopsy site","Biopsy type","Disease","Biopsy timepoint","Library Type","Diagnosis","FFPE or Fresh Frozen","Race(s)")],2,function(x) kruskal.test(metadata_lissa$CES.tpm,x))
## $body_site
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 16.451, df = 14, p-value = 0.2866
## 
## 
## $sex
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.9572, df = 2, p-value = 0.1383
## 
## 
## $`Biopsy site`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 34.569, df = 30, p-value = 0.2587
## 
## 
## $`Biopsy type`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 1.3415, df = 3, p-value = 0.7193
## 
## 
## $Disease
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 8.557, df = 7, p-value = 0.286
## 
## 
## $`Biopsy timepoint`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.5726, df = 2, p-value = 0.1676
## 
## 
## $`Library Type`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 7.3679, df = 2, p-value = 0.02512
## 
## 
## $Diagnosis
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 6.4716, df = 6, p-value = 0.3725
## 
## 
## $`FFPE or Fresh Frozen`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 2.188, df = 2, p-value = 0.3349
## 
## 
## $`Race(s)`
## 
##  Kruskal-Wallis rank sum test
## 
## data:  metadata_lissa$CES.tpm and x
## Kruskal-Wallis chi-squared = 3.3901, df = 3, p-value = 0.3353
apply(metadata_lissa[,c("Age at diagnosis","Number of systemic therapy","Age at biopsy timepoint")],2,function(x) cor.test(metadata_lissa$CES.tpm,as.numeric(x)))
## $`Age at diagnosis`
## 
##  Pearson's product-moment correlation
## 
## data:  metadata_lissa$CES.tpm and as.numeric(x)
## t = 0.17236, df = 98, p-value = 0.8635
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1796240  0.2130977
## sample estimates:
##        cor 
## 0.01740826 
## 
## 
## $`Number of systemic therapy`
## 
##  Pearson's product-moment correlation
## 
## data:  metadata_lissa$CES.tpm and as.numeric(x)
## t = -0.91157, df = 98, p-value = 0.3642
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2830154  0.1066443
## sample estimates:
##         cor 
## -0.09169449 
## 
## 
## $`Age at biopsy timepoint`
## 
##  Pearson's product-moment correlation
## 
## data:  metadata_lissa$CES.tpm and as.numeric(x)
## t = 0.088373, df = 97, p-value = 0.9298
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1887736  0.2060195
## sample estimates:
##         cor 
## 0.008972589
ggplot(metadata_lissa,aes(x=`Smoking history`,y=CES.tpm)) + geom_boxplot() + 
  ylab("CES expression (log2TPM)")

ggplot(metadata_lissa,aes(x=`Library Type`,y=CES.tpm)) + geom_boxplot() + 
  ylab("CES expression (log2TPM)")

Conclusion: There is an association between CES score and smoking history and library method in the Lissa et al relapse cohort.

Subtyping

NE signature

Calculate 50-gene signature indicating neuroendocrine differentiation (-1 to 1, non-NE vs. NE differentiation) and divide into NE (>0) and non-NE (<0)

ne50_UP = c("BEX1", "ASCL1", "INSM1", "CHGA","TAGLN3", "KIF5C", "CRMP1", "SCG3","SYT4", "RTN1", "MYT1", "SYP","KIF1A", "TMSB15A", "SYN1","SYT11", "RUNDC3A", "TFF3","CHGB", "FAM57B", "SH3GL2", "BSN","SEZ6", "TMSB15B", "CELF3")
ne50_DN = c("RAB27B", "TGFBR2", "SLC16A5","S100A10", "ITGB4", "YAP1", "LGALS3","EPHA2", "S100A16", "PLAU", "ABCC3","ARHGDIB", "CYR61", "PTGES", "CCND1","IFITM2", "IFITM3", "AHNAK", "CAV2","TACSTD2", "TGFBI", "EMP1", "CAV1","ANXA1", "MYOF")

ne50 = list(ne50_UP = genes[match(ne50_UP,genes$GeneName),]$gene_id,
           ne50_DN = genes[match(ne50_DN,genes$GeneName),]$gene_id)

George

george.ne50 <- gsva(as.matrix(george.tpm), ne50, method="ssgsea")
## Warning: Calling gsva(expr=., gset.idx.list=., method=., ...) is deprecated;
## use a method-specific parameter object (see '?gsva').
## Warning in .filterFeatures(expr, method): 10827 genes with constant expression
## values throughout the samples.
## Estimating ssGSEA scores for 2 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  96%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%
## 
## [1] "Normalizing..."
george.ne50 = as.data.frame(t(george.ne50))
george.ne50$ne50_combined = george.ne50$ne50_UP-george.ne50$ne50_DN

metadata_george_filter$NE50.score = unlist(george.ne50[match(metadata_george_filter$`Sample-ID`,rownames(george.ne50)),]$ne50_combined)
metadata_george_filter$NE50.subtype = ifelse(metadata_george_filter$NE50.score > 0,"NE","Non-NE")

table(metadata_george_filter$NE50.subtype)
## 
##     NE Non-NE 
##     42     15

Compare to NE50 score from Lissa et al Figure S7b source data

subtype.figs7b = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig S7b")
subtype.figs7b = merge(subtype.figs7b,metadata_george_filter[,c("Sample-ID","NE50.score")],by.x="Sample_id",by.y="Sample-ID")

ggplot(subtype.figs7b,aes(x=NE_Score,y=NE50.score)) + geom_point() + xlab("Lissa NE50 score") + ylab("NE50 score") + 
  geom_vline(xintercept=0,linetype="dashed") +
  geom_hline(yintercept=0,linetype="dashed") 

Conclusion: There is some variability between the earlier NE50 score calculation and that calculated here for the George cohort.

Lissa

lissa.ne50 <- gsva(as.matrix(lissa.tpm), ne50, method="ssgsea")
## Warning: Calling gsva(expr=., gset.idx.list=., method=., ...) is deprecated;
## use a method-specific parameter object (see '?gsva').
## Warning in .filterFeatures(expr, method): 9030 genes with constant expression
## values throughout the samples.
## Estimating ssGSEA scores for 2 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## 
## [1] "Normalizing..."
lissa.ne50 = as.data.frame(t(lissa.ne50))
lissa.ne50$ne50_combined = lissa.ne50$ne50_UP-lissa.ne50$ne50_DN

metadata_lissa$NE50.score = unlist(lissa.ne50[match(metadata_lissa$NCI_sample_ID.RNA,rownames(lissa.ne50)),]$ne50_combined)
metadata_lissa$NE50.subtype = ifelse(metadata_lissa$NE50.score > 0,"NE","Non-NE")

table(metadata_lissa$Disease,metadata_lissa$NE50.subtype)
##                   
##                    NE Non-NE
##   EP_bladder        0      1
##   EP_cervix         1      2
##   EP_EGFRmt_trSCLC  2      2
##   EP_ovary          0      1
##   EP_prostate       1      0
##   EP_rectum         0      1
##   EP_subglottis     0      1
##   SCLC             52     36

Compare to NE50 subtype from Lissa et al Figure 2f source data

subtype.fig2f = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig 2f")
subtype.fig2f = merge(subtype.fig2f,metadata_lissa[,c("dbGap sample ID","NE50.subtype")],by.x="Sample_id",by.y="dbGap sample ID")

table(subtype.fig2f$Subtype,subtype.fig2f$NE50.subtype)
##         
##          NE Non-NE
##   NE     39     11
##   Non NE  5     17

Conclusion: There is some variability between the earlier NE50 score calculation and that calculated here for the Lissa cohort.

Compare to CENPA

# Score vs score
ggplot(metadata_george_filter,aes(x=CENPA.tpm,y=NE50.score)) + geom_point(aes(color=CENPA.category,shape=NE50.subtype)) +
  geom_vline(xintercept=median(metadata_george_filter$CENPA.tpm),linetype="dashed") +
  geom_hline(yintercept=0,linetype="dashed") + 
  ggtitle("George, CENPA vs. NE50") + xlab("CENPA expression (log2TPM)") + ylab("NE50 score") +
  scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="NE50 subtype") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$NE50.score,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_george_filter$CENPA.tpm and metadata_george_filter$NE50.score
## S = 15624, p-value = 0.0001171
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4936479
# Score vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,y=CENPA.tpm)) + geom_boxplot() + 
  ggtitle("George, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("CENPA expression (log2TPM)")

wilcox.test(metadata_george_filter$CENPA.tpm~metadata_george_filter$NE50.subtype)
## 
##  Wilcoxon rank sum exact test
## 
## data:  metadata_george_filter$CENPA.tpm by metadata_george_filter$NE50.subtype
## W = 490, p-value = 0.001125
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
  ggtitle("George, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") + 
  scale_fill_discrete("CENPA category")

table(metadata_george_filter$CENPA.category,metadata_george_filter$NE50.subtype)
##       
##        NE Non-NE
##   High 25      3
##   Low  17     12
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$NE50.subtype)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_george_filter$CENPA.category and metadata_george_filter$NE50.subtype
## X-squared = 5.4175, df = 1, p-value = 0.01994

Conclusion: In the George cohort, the NE score moderately correlates with CENPA expression, and NE samples have a higher CENPA expression.

# Score vs score
ggplot(metadata_lissa,aes(x=CENPA.tpm,y=NE50.score)) + geom_point(aes(color=CENPA.category,shape=NE50.subtype)) +
  geom_vline(xintercept=median(metadata_lissa$CENPA.tpm),linetype="dashed") +
  geom_hline(yintercept=0,linetype="dashed") + 
  ggtitle("Lissa, CENPA vs. NE50") + xlab("CENPA expression (log2TPM)") + ylab("NE50 score") +
  scale_color_discrete(name="CENPA category") + scale_shape_discrete(name="NE50 subtype") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_lissa$CENPA.tpm,metadata_lissa$NE50.score,method="spearman")
## Warning in cor.test.default(metadata_lissa$CENPA.tpm,
## metadata_lissa$NE50.score, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_lissa$CENPA.tpm and metadata_lissa$NE50.score
## S = 109483, p-value = 0.000476
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3430373
# Score vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,y=CENPA.tpm)) + geom_boxplot() + 
  ggtitle("Lissa, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("CENPA expression (log2TPM)")

wilcox.test(metadata_lissa$CENPA.tpm~metadata_lissa$NE50.subtype)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CENPA.tpm by metadata_lissa$NE50.subtype
## W = 1662, p-value = 0.002859
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
  ggtitle("Lissa, CENPA vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") + 
  scale_fill_discrete("CENPA category")

table(metadata_lissa$CENPA.category,metadata_lissa$NE50.subtype)
##       
##        NE Non-NE
##   High 32     18
##   Low  24     26
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$NE50.subtype)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_lissa$CENPA.category and metadata_lissa$NE50.subtype
## X-squared = 1.9886, df = 1, p-value = 0.1585

Conclusion: There is a weaker correlation between NE score and CENPA expression in the Lissa cohort.

Compare to CES

# Score vs score
ggplot(metadata_george_filter,aes(x=CES.tpm,y=NE50.score)) + geom_point(aes(color=CES.category,shape=NE50.subtype)) +
  geom_vline(xintercept=median(metadata_george_filter$CES.tpm),linetype="dashed") +
  geom_hline(yintercept=0,linetype="dashed") + 
  ggtitle("George, CES vs. NE50") + xlab("CES expression (log2TPM)") + ylab("NE50 score") +
  scale_color_discrete(name="CES category") + scale_shape_discrete(name="NE50 subtype") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_george_filter$CES.tpm,metadata_george_filter$NE50.score,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_george_filter$CES.tpm and metadata_george_filter$NE50.score
## S = 11198, p-value = 2.24e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.6370884
# Score vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,y=CES.tpm)) + geom_boxplot() + 
  ggtitle("George, CES vs. NE50") + xlab("NE50 subtype") + ylab("CES expression (log2TPM)")

wilcox.test(metadata_george_filter$CES.tpm~metadata_george_filter$NE50.subtype)
## 
##  Wilcoxon rank sum exact test
## 
## data:  metadata_george_filter$CES.tpm by metadata_george_filter$NE50.subtype
## W = 521, p-value = 9.195e-05
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_george_filter,aes(x=NE50.subtype,fill=CES.category)) + geom_bar(position="fill") +
  ggtitle("George, CES vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") + 
  scale_fill_discrete("CES category")

table(metadata_george_filter$CES.category,metadata_george_filter$NE50.subtype)
##       
##        NE Non-NE
##   High 26      2
##   Low  16     13
chisq.test(metadata_george_filter$CES.category,metadata_george_filter$NE50.subtype)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_george_filter$CES.category and metadata_george_filter$NE50.subtype
## X-squared = 8.5803, df = 1, p-value = 0.003398
# Score vs score
ggplot(metadata_lissa,aes(x=CES.tpm,y=NE50.score)) + geom_point(aes(color=CES.category,shape=NE50.subtype)) +
  geom_vline(xintercept=median(metadata_lissa$CES.tpm),linetype="dashed") +
  geom_hline(yintercept=0,linetype="dashed") + 
  ggtitle("Lissa, CES vs. NE50") + xlab("CES expression (log2TPM)") + ylab("NE50 score") +
  scale_color_discrete(name="CES category") + scale_shape_discrete(name="NE50 subtype") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_lissa$CES.tpm,metadata_lissa$NE50.score,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_lissa$CES.tpm and metadata_lissa$NE50.score
## S = 101464, p-value = 6.545e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3911551
# Score vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,y=CES.tpm)) + geom_boxplot() + 
  ggtitle("Lissa, CES vs. NE50") + xlab("NE50 subtype") + ylab("CES expression (log2TPM)")

wilcox.test(metadata_lissa$CES.tpm~metadata_lissa$NE50.subtype)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata_lissa$CES.tpm by metadata_lissa$NE50.subtype
## W = 1730, p-value = 0.000551
## alternative hypothesis: true location shift is not equal to 0
# Category vs category
ggplot(metadata_lissa,aes(x=NE50.subtype,fill=CES.category)) + geom_bar(position="fill") +
  ggtitle("Lissa, CES vs. NE50") + xlab("NE50 subtype") + ylab("Proportion") + 
  scale_fill_discrete("CES category")

table(metadata_lissa$CES.category,metadata_lissa$NE50.subtype)
##       
##        NE Non-NE
##   High 35     15
##   Low  21     29
chisq.test(metadata_lissa$CES.category,metadata_lissa$NE50.subtype)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  metadata_lissa$CES.category and metadata_lissa$NE50.subtype
## X-squared = 6.8588, df = 1, p-value = 0.008821

Conclusion: There is also a moderate correlation between NE score and CES score for both cohorts.

SCNC subtyping

subtype_tf = c("ASCL1","NEUROD1","POU2F3","YAP1")

Assign SCNC subtype based on the transcription factor with the highest normalized expression within each sample.

George

george.tf = george.tpm[genes[match(subtype_tf,genes$GeneName),]$gene_id,]
rownames(george.tf) = subtype_tf
george.tf = apply(george.tf,2,function(x) (x-mean(x))/sd(x))
george.clust = hclust(dist(t(george.tf)))

Heatmap(matrix=george.tf,
  cluster_columns = george.clust,
  cluster_rows = FALSE,
  show_column_names = TRUE,
  show_row_dend = FALSE,
  heatmap_legend_param = list(
     title = "Expression\nZ-score",
     labels_gp = gpar(fontsize = 12),
     title_gp = gpar(fontsize = 12)
  ),
  column_dend_height = unit(1,"inches")
)

# Assign to greatest Z-score
george.subtype = data.frame(Sample=george.clust$labels[george.clust$order],
                            Cluster=c(rep("SCNC-Y",2),rep("SCNC-P",8),rep("SCNC-A",41),rep("SCNC-P",1),rep("SCNC-N",5)))
george.subtype$Subtype = apply(george.tf,2,function(x) row.names(george.tf[which.max(x),,drop=FALSE]))[george.subtype$Sample]

#table(george.subtype$Cluster,george.subtype$Subtype)

# Add to metadata
metadata_george_filter$SCNC.subtype = george.subtype[match(metadata_george_filter$`Sample-ID`,
                                                      george.subtype$Sample),]$Subtype
metadata_george_filter$SCNC.subtype = mapvalues(metadata_george_filter$SCNC.subtype,subtype_tf,c("SCNC-A","SCNC-N","SCNC-P","SCNC-Y"))

table(metadata_george_filter$SCNC.subtype)
## 
## SCNC-A SCNC-N SCNC-P SCNC-Y 
##     41      5      9      2

Compare to SCNC assignments from Rudin et al

subtype.rudin = read.xlsx("metadata/41568_2019_133_MOESM2_ESM.xlsx")
subtype.rudin = subtype.rudin[which(subtype.rudin$Source == "George (2015)"),]
subtype.rudin$Name = gsub("T$","",subtype.rudin$Name)

subtype.rudin = merge(subtype.rudin,george.subtype,by.x="Name",by.y="Sample")
table(subtype.rudin$Subtype.assignment,subtype.rudin$Subtype)
##         
##          ASCL1 NEUROD1 POU2F3 YAP1
##   SCLC-A    37       0      0    0
##   SCLC-N     0       5      0    0
##   SCLC-P     0       0      7    0
##   SCLC-Y     0       0      0    2

Conclusion: The George SCNC subtypes completely correspond with subtypes assigned in Rudin et al.

Lissa

lissa.tf = lissa.tpm[genes[match(subtype_tf,genes$GeneName),]$gene_id,]
rownames(lissa.tf) = subtype_tf
lissa.tf = apply(lissa.tf,2,function(x) (x-mean(x))/sd(x))

# Test with data from paper
#lissa.tf = subtype.fig2f
#rownames(lissa.tf) = subtype.fig2f$Sample_id
#lissa.tf = apply(t(lissa.tf[,subtype_tf]),2,function(x) (x-mean(x))/sd(x))

lissa.clust = hclust(dist(t(lissa.tf)))

Heatmap(matrix=lissa.tf,
  cluster_columns = lissa.clust,
  cluster_rows = FALSE,
  show_column_names = TRUE,
  show_row_dend = FALSE,
  heatmap_legend_param = list(
     title = "Expression\nZ-score",
     labels_gp = gpar(fontsize = 12),
     title_gp = gpar(fontsize = 12)
  ),
  column_dend_height = unit(1,"inches")
)

# Assign to TF with greatest expression
lissa.subtype = data.frame(Sample=lissa.clust$labels[lissa.clust$order],
                            Cluster=c(rep("SCNC-Y",28),rep("SCNC-A",12),rep("SCNC-N",16),rep("SCNC-P",2),rep("SCNC-Y",19),rep("SCNC-A",23)))
lissa.subtype$Subtype = apply(lissa.tf,2,function(x) row.names(lissa.tf[which.max(x),,drop=FALSE]))[lissa.subtype$Sample]

#table(lissa.subtype$Cluster,lissa.subtype$Subtype)

# Add to metadata
metadata_lissa$SCNC.subtype = lissa.subtype[match(metadata_lissa$NCI_sample_ID.RNA,
                                                      lissa.subtype$Sample),]$Subtype
metadata_lissa$SCNC.subtype = mapvalues(metadata_lissa$SCNC.subtype,subtype_tf,c("SCNC-A","SCNC-N","SCNC-P","SCNC-Y"))
rm(lissa.subtype)

table(metadata_lissa$Disease,metadata_lissa$SCNC.subtype)
##                   
##                    SCNC-A SCNC-N SCNC-P SCNC-Y
##   EP_bladder            0      1      0      0
##   EP_cervix             0      1      0      2
##   EP_EGFRmt_trSCLC      0      1      0      3
##   EP_ovary              0      0      0      1
##   EP_prostate           1      0      0      0
##   EP_rectum             1      0      0      0
##   EP_subglottis         0      0      0      1
##   SCLC                 36     15      2     35

Compare to SCNC assignments from Lissa et al Figure 2h source data

subtype.fig2h = read.xlsx("metadata/41467_2022_29517_MOESM9_ESM.xlsx",sheet="Fig 2h")
subtype.fig2h = merge(subtype.fig2h,metadata_lissa[,c("dbGap sample ID","SCNC.subtype")],
                      by.x="Sample_id",by.y="dbGap sample ID")

table(subtype.fig2h$Subtype,subtype.fig2h$SCNC.subtype)
##         
##          SCNC-A SCNC-N SCNC-P SCNC-Y
##   SCNC-A     23      0      0      7
##   SCNC-N      3     14      0      3
##   SCNC-P      0      0      1      0
##   SCNC-Y      2      1      0     18

Conclusion: Hierarchical clustering leads to less clear subtype delineations for the Lissa cohort, and several samples are differently classified compared to Lissa et al.

Compare to CENPA

# Score vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,y=CENPA.tpm)) + geom_boxplot() + 
  ggtitle("George, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("CENPA expression (log2TPM)")

pairwise.wilcox.test(metadata_george_filter$CENPA.tpm,metadata_george_filter$SCNC.subtype)
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  metadata_george_filter$CENPA.tpm and metadata_george_filter$SCNC.subtype 
## 
##        SCNC-A SCNC-N SCNC-P
## SCNC-N 1.00   -      -     
## SCNC-P 0.25   0.25   -     
## SCNC-Y 1.00   0.76   1.00  
## 
## P value adjustment method: holm
# Category vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
  ggtitle("George, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") + 
  scale_fill_discrete("CENPA category")

table(metadata_george_filter$CENPA.category,metadata_george_filter$SCNC.subtype)
##       
##        SCNC-A SCNC-N SCNC-P SCNC-Y
##   High     21      4      2      1
##   Low      20      1      7      1
chisq.test(metadata_george_filter$CENPA.category,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$CENPA.category,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_george_filter$CENPA.category and metadata_george_filter$SCNC.subtype
## X-squared = 4.586, df = 3, p-value = 0.2047

Conclusion: CENPA expression is lower in the SCNC-Y subtype, but not significantly so.

# Score vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,y=CENPA.tpm)) + geom_boxplot() + 
  ggtitle("Lissa, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("CENPA expression (log2TPM)")

pairwise.wilcox.test(metadata_lissa$CENPA.tpm,metadata_lissa$SCNC.subtype)
## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  metadata_lissa$CENPA.tpm and metadata_lissa$SCNC.subtype 
## 
##        SCNC-A SCNC-N SCNC-P
## SCNC-N 0.974  -      -     
## SCNC-P 1.000  1.000  -     
## SCNC-Y 0.006  0.236  0.761 
## 
## P value adjustment method: holm
# Category vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,fill=CENPA.category)) + geom_bar(position="fill") +
  ggtitle("Lissa, CENPA vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") + 
  scale_fill_discrete("CENPA category")

table(metadata_lissa$CENPA.category,metadata_lissa$SCNC.subtype)
##       
##        SCNC-A SCNC-N SCNC-P SCNC-Y
##   High     25     10      2     13
##   Low      13      8      0     29
chisq.test(metadata_lissa$CENPA.category,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$CENPA.category,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_lissa$CENPA.category and metadata_lissa$SCNC.subtype
## X-squared = 12.107, df = 3, p-value = 0.007026

Conclusion: CENPA category differs across the SCNC subtypes, with the lowest expression in SCNC-Y.

Compare to CES

# Score vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,y=CES.tpm)) + geom_boxplot() + 
  ggtitle("George, CES vs. SCNC") + xlab("SCNC subtype") + ylab("CES expression (log2TPM)")

pairwise.wilcox.test(metadata_george_filter$CES.tpm,metadata_george_filter$SCNC.subtype)
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  metadata_george_filter$CES.tpm and metadata_george_filter$SCNC.subtype 
## 
##        SCNC-A SCNC-N SCNC-P
## SCNC-N 0.073  -      -     
## SCNC-P 0.129  0.012  -     
## SCNC-Y 0.749  0.571  0.909 
## 
## P value adjustment method: holm
# Category vs category
ggplot(metadata_george_filter,aes(x=SCNC.subtype,fill=CES.category)) + geom_bar(position="fill") +
  ggtitle("George, CES vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") + 
  scale_fill_discrete("CES category")

table(metadata_george_filter$CES.category,metadata_george_filter$SCNC.subtype)
##       
##        SCNC-A SCNC-N SCNC-P SCNC-Y
##   High     22      4      1      1
##   Low      19      1      8      1
chisq.test(metadata_george_filter$CES.category,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$CES.category,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_george_filter$CES.category and metadata_george_filter$SCNC.subtype
## X-squared = 7.4487, df = 3, p-value = 0.05889

Conclusion: The CES score is higher in the SCNC-N subtype vs. the SCNC-P subtype.

# Score vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,y=CES.tpm)) + geom_boxplot() + 
  ggtitle("Lissa, CES vs. SCNC") + xlab("SCNC subtype") + ylab("CES expression (log2TPM)")

pairwise.wilcox.test(metadata_lissa$CES.tpm,metadata_lissa$SCNC.subtype)
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  metadata_lissa$CES.tpm and metadata_lissa$SCNC.subtype 
## 
##        SCNC-A SCNC-N SCNC-P
## SCNC-N 1.0000 -      -     
## SCNC-P 1.0000 1.0000 -     
## SCNC-Y 0.0021 0.2283 0.4736
## 
## P value adjustment method: holm
# Category vs category
ggplot(metadata_lissa,aes(x=SCNC.subtype,fill=CES.category)) + geom_bar(position="fill") +
  ggtitle("Lissa, CES vs. SCNC") + xlab("SCNC subtype") + ylab("Proportion") + 
  scale_fill_discrete("CES category")

table(metadata_lissa$CES.category,metadata_lissa$SCNC.subtype)
##       
##        SCNC-A SCNC-N SCNC-P SCNC-Y
##   High     24      9      2     15
##   Low      14      9      0     27
chisq.test(metadata_lissa$CES.category,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$CES.category,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_lissa$CES.category and metadata_lissa$SCNC.subtype
## X-squared = 8.0602, df = 3, p-value = 0.04478

Conclusion: The CES score is lower in the SCNC-Y subtype than the the SCNC-A subtype.

NE signature vs. subtyping

George

table(metadata_george_filter$NE50.subtype,metadata_george_filter$SCNC.subtype)
##         
##          SCNC-A SCNC-N SCNC-P SCNC-Y
##   NE         36      5      1      0
##   Non-NE      5      0      8      2
chisq.test(metadata_george_filter$NE50.subtype,metadata_george_filter$SCNC.subtype)
## Warning in chisq.test(metadata_george_filter$NE50.subtype,
## metadata_george_filter$SCNC.subtype): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_george_filter$NE50.subtype and metadata_george_filter$SCNC.subtype
## X-squared = 29.775, df = 3, p-value = 1.539e-06

Lissa

table(metadata_lissa$NE50.subtype,metadata_lissa$SCNC.subtype)
##         
##          SCNC-A SCNC-N SCNC-P SCNC-Y
##   NE         32     14      0     10
##   Non-NE      6      4      2     32
chisq.test(metadata_lissa$NE50.subtype,metadata_lissa$SCNC.subtype)
## Warning in chisq.test(metadata_lissa$NE50.subtype,
## metadata_lissa$SCNC.subtype): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  metadata_lissa$NE50.subtype and metadata_lissa$SCNC.subtype
## X-squared = 35.946, df = 3, p-value = 7.686e-08

Conclusion: The NE subtype is significantly associated with the SCNC subtype, as expected.

Aneuploidy

Cohort: Lissa et al (relapse/metastasis), 29 WXS tumor/normal pairs available via the Oncogenomics portal

Method:

  • Oncogenomics pipeline was run on all cases as part of Lissa et al publication
  • Archived results downloaded from HPC DME
  • Allele-specific copy number profiles generated with:
    • Sequenza (run as part of Oncogenomics pipeline)
    • FACETS (run on tumor/normal alignments)
  • Aneuploidy metrics calculated

Aneuploidy metrics

From Oncogenomics portal (calculated from Sequenza):

  • Non.diploid.Length - non-diploid length
  • Non.diploid.Ratio - ratio of non-diploid length to total length
  • Total.segments - total number of segments

From FACETS summary table:

  • facets.hypoploid - whether true hypoploidy exists
  • facets.fraction_loh - fraction LOH

From FACETS QC output:

  • facets.wgd - whether whole genome duplication occurred
  • facets.fga - fraction of genome altered
  • facets.n_segs - number of segments

From FACETS chromosome arm-level summary:

  • facets.arm - whether any chromosome arm-level alterations are reported

ABSOLUTE (Carter 2012 Nature Computational Biology)

Original manuscript identified genome doubling via ploidy inflection points per cancer type, which corresponds to a transition in major copy number (MCN). MCN will be primarily even if doubling has occurred. Calculated for Sequenza

  • wgd.cnmode - the most prevalent MCN by length
  • wgd.absolute - whether the most prevalent MCN by length is >1

Major copy number (Bielski 2018 Nat Genetics)

WGD is called if >50% of autosomal genome had MCN >= 2. Calculated for Sequenza

  • wgd.mcn - whether length of segments with MCN >= 2 is >50% total segment length, autosomes only

CIN metrics (Oza 2023 PeerJ)

Algorithms independently implemented (R package takes only array-based segment data from TCGA). Calculated for both Sequenza and FACETS. Except where noted, considers only segments with absolute log2 copy number ratio >= 0.2

  • CIN.TAI: Total Aberration Index - weighted mean absolute copy number
  • CIN.modifiedTAI: modified Total Aberration Index - considers directionality; no threshold
  • CIN.CNA: Copy Number Abnormality - number of segments; absolute log2 copy number ratio >= log2(1.7)-1, >= 0.2 difference from previous segment, length >= 10bp only
  • CIN.breakpoints: number of break points - 2*number of segments
  • CIN.bases: length of altered base segments - sum of altered bases
  • CIN.FGA: Fraction of Genome Altered - sum of altered bases over total chromosome length, autosomes only
aneuploidy.files = list.files("wgs/Lissa/aneuploidy",pattern="anueploidy_metrics.txt",full.names = TRUE)
aneuploidy = lapply(aneuploidy.files,function(x) read.table(x,header=TRUE,sep = '\t'))
names(aneuploidy) = gsub(".anueploidy_metrics.txt","",gsub("wgs/Lissa/aneuploidy/","",aneuploidy.files))
aneuploidy = ldply(aneuploidy,.id="Sample")

aneuploidy$wgd.mcn = ifelse(aneuploidy$wgd.mcn == "Yes",TRUE,FALSE)
aneuploidy$wgd.absolute = ifelse(aneuploidy$wgd.absolute == "Yes",TRUE,FALSE)
aneuploidy$facets.arm = ifelse(aneuploidy$facets.arm == "Yes",TRUE,FALSE)
#Add aneuploidy metric from Oncogenomics portal
frac_diploid = read.table("wgs/Lissa/23461.sequenza.summary.tsv",header=TRUE,sep='\t')
colnames(frac_diploid) = mapvalues(colnames(frac_diploid),c("Ratio"),c("Non.diploid.Ratio"))

aneuploidy = merge(aneuploidy,frac_diploid[,c("Sample.ID","Sample.Name","Case.ID","Non.diploid.Length","Non.diploid.Ratio","Total.segments")],
                   by.x="Sample",
                   by.y="Sample.ID",
                   all.x=TRUE)

rm(frac_diploid)

aneuploidy.continuous = c("facets.fraction_loh","facets.fga","facets.n_segs","wgd.cnmode","sequenza.CIN.TAI","sequenza.CIN.modifiedTAI","sequenza.CIN.CNA","sequenza.CIN.breakpoints","sequenza.CIN.bases","sequenza.CIN.FGA","facets.CIN.TAI","facets.CIN.modifiedTAI","facets.CIN.CNA","facets.CIN.breakpoints","facets.CIN.bases","facets.CIN.FGA","Non.diploid.Length","Non.diploid.Ratio","Total.segments")

aneuploidy.discrete = c("facets.hypoploid","facets.arm","facets.wgd","wgd.mcn","wgd.absolute")

Sequenza vs FACETS purity/ploidy

ggplot(aneuploidy,aes(x=sequenza.cellularity,y=facets.purity)) + geom_point() + 
  xlab("Sequenza cellularity") + ylab("FACETS purity") + 
  xlim(0,1) + ylim(0,1) + 
  geom_smooth(method=lm) 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

cor.test(aneuploidy$sequenza.cellularity,aneuploidy$facets.purity)
## 
##  Pearson's product-moment correlation
## 
## data:  aneuploidy$sequenza.cellularity and aneuploidy$facets.purity
## t = 4.6225, df = 26, p-value = 9.105e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3983867 0.8353896
## sample estimates:
##       cor 
## 0.6716387
ggplot(aneuploidy,aes(x=sequenza.ploidy,y=facets.ploidy)) + geom_point() + 
  xlab("Sequenza ploidy") + ylab("FACETS ploidy") + 
  xlim(1,7) + ylim(1,7) + 
  geom_smooth(method=lm) 
## `geom_smooth()` using formula = 'y ~ x'

cor.test(aneuploidy$sequenza.ploidy,aneuploidy$facets.ploidy)
## 
##  Pearson's product-moment correlation
## 
## data:  aneuploidy$sequenza.ploidy and aneuploidy$facets.ploidy
## t = 4.8537, df = 27, p-value = 4.512e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4215869 0.8391761
## sample estimates:
##       cor 
## 0.6826185

Conclusion: Ploidy and purity/cellularity calls are moderately correlated between FACETS and Sequenza.

Ploidy inflection point

The ABSOLUTE paper (Carter 2012) identified WGDs based on the ploidy inflection point for each cancer (Supplementary Figure 9).

ggplot(aneuploidy,aes(x=rank(facets.ploidy,ties="random"),y=facets.ploidy,color=facets.wgd)) + geom_point() +
  xlab("Ploidy rank") + ylab("FACETS ploidy") + scale_color_discrete(name="FACETS WGD")

Conclusion: Two ploidy inflection points are visible for SCNC, potentially corresponding to single and multiple WGD events, and the first inflection point agrees with WGD as called by FACETS.

Metric concordance

Continuous metrics: Pearson correlation

aneuploidy.cor = cor(aneuploidy[,aneuploidy.continuous],use="pairwise.complete.obs")

Heatmap(aneuploidy.cor,col=colorRamp2(c(-1, 0, 1), c("blue", "white", "red")))

Discrete metrics: % concordant

apply(aneuploidy[,aneuploidy.discrete],2,function(x)
  apply(aneuploidy[,aneuploidy.discrete],2,function(y)
    sum(x == y)/length(x)))
##                  facets.hypoploid facets.arm facets.wgd   wgd.mcn wgd.absolute
## facets.hypoploid       1.00000000 0.03448276  0.2413793 0.3103448    0.3793103
## facets.arm             0.03448276 1.00000000  0.7241379 0.7241379    0.6551724
## facets.wgd             0.24137931 0.72413793  1.0000000 0.8620690    0.8620690
## wgd.mcn                0.31034483 0.72413793  0.8620690 1.0000000    0.9310345
## wgd.absolute           0.37931034 0.65517241  0.8620690 0.9310345    1.0000000

Conclusion: There is high correlation between the aneuploidy metrics, which divide roughly into metrics based on the length of the altered genome and metrics based on the number of altered segments. FACETS and Sequenza generally agree.

Compare aneuploidy and CENP-A expression

For each aneuploidy metric, test for significant association with CENP-A expression and CES score. Sorted p-values are printed for all metrics (no multiple hypothesis correction) and significant associations are plotted.

metadata_lissa_aneuploidy = merge(metadata_lissa,
                           aneuploidy,
                           by.x="NCI_sample_ID.tumor",
                           by.y="Sample.Name")

CENP-A vs. aneuploidy

# CENP-A TPM vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
  as.numeric(unlist(cor.test(metadata_lissa_aneuploidy$CENPA.tpm,x,method="spearman"))["p.value"])))
## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CENPA.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
##           facets.CIN.TAI         sequenza.CIN.TAI sequenza.CIN.breakpoints 
##                0.0377728                0.1114889                0.2249041 
##           Total.segments               facets.fga   facets.CIN.breakpoints 
##                0.3505371                0.4518444                0.4826533 
##            facets.n_segs        Non.diploid.Ratio      facets.fraction_loh 
##                0.5046517                0.5711288                0.6091188 
##   facets.CIN.modifiedTAI sequenza.CIN.modifiedTAI           facets.CIN.CNA 
##                0.6261078                0.6352256                0.7252966 
##         sequenza.CIN.CNA           facets.CIN.FGA       Non.diploid.Length 
##                0.7445796                0.7601180                0.7993599 
##         facets.CIN.bases               wgd.cnmode       sequenza.CIN.bases 
##                0.8530996                0.8803755                0.9745376 
##         sequenza.CIN.FGA 
##                1.0000000
# CENP-A TPM vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
  as.numeric(unlist(wilcox.test(metadata_lissa_aneuploidy$CENPA.tpm~x))["p.value"])))
## facets.hypoploid          wgd.mcn     wgd.absolute       facets.wgd 
##       0.06896552       0.42857126       0.42866858       0.64949809
ggplot(metadata_lissa_aneuploidy,aes(x=CENPA.tpm,y=facets.CIN.TAI)) + geom_point(aes(color=CENPA.category)) +
  geom_vline(xintercept=median(metadata_lissa_aneuploidy$CENPA.tpm),linetype="dashed") +
  geom_hline(yintercept=median(metadata_lissa_aneuploidy$facets.CIN.TAI),linetype="dashed") + 
  ggtitle("Lissa, CENPA vs. FACETS TAI") + xlab("CENPA expression (log2TPM)") + ylab("FACETS TAI") +
  scale_color_discrete(name="CENPA category") + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_lissa_aneuploidy$CENPA.tpm,metadata_lissa_aneuploidy$facets.CIN.TAI,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_lissa_aneuploidy$CENPA.tpm and metadata_lissa_aneuploidy$facets.CIN.TAI
## S = 2480, p-value = 0.03777
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3891626

CENP-A category vs aneuploidy

# CENP-A category vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
  as.numeric(unlist(wilcox.test(x~metadata_lissa_aneuploidy$CENPA.category))["p.value"])))
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##           facets.CIN.TAI         sequenza.CIN.TAI sequenza.CIN.breakpoints 
##               0.02915903               0.11206979               0.14561932 
##           Total.segments            facets.n_segs   facets.CIN.breakpoints 
##               0.19032105               0.27029878               0.27517440 
##           facets.CIN.CNA         sequenza.CIN.CNA   facets.CIN.modifiedTAI 
##               0.40049439               0.50453927               0.50453927 
##         sequenza.CIN.FGA       sequenza.CIN.bases               facets.fga 
##               0.68295388               0.74719240               0.75992421 
##      facets.fraction_loh        Non.diploid.Ratio               wgd.cnmode 
##               0.79330382               0.80632816               0.87029182 
##         facets.CIN.bases       Non.diploid.Length           facets.CIN.FGA 
##               0.88047764               0.91447537               0.94862324 
## sequenza.CIN.modifiedTAI 
##               0.98286486
# CENP-A category vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
  as.numeric(unlist(chisq.test(x,metadata_lissa_aneuploidy$CENPA.category))["p.value"])))
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(x, metadata_lissa_aneuploidy$CENPA.category): Chi-squared
## approximation may be incorrect
##       facets.wgd          wgd.mcn     wgd.absolute facets.hypoploid 
##        0.7633841        0.7633841        0.7978615        0.9719888
ggplot(metadata_lissa_aneuploidy,aes(x=CENPA.category,y=facets.CIN.TAI)) + geom_boxplot() +
  ggtitle("Lissa, CENPA vs. FACETS TAI") + xlab("CENPA category") + ylab("FACETS TAI")

CES vs. aneuploidy

# CES TPM vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
  as.numeric(unlist(cor.test(metadata_lissa_aneuploidy$CES.tpm,x,method="spearman"))["p.value"])))
## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties

## Warning in cor.test.default(metadata_lissa_aneuploidy$CES.tpm, x, method =
## "spearman"): Cannot compute exact p-value with ties
##           facets.CIN.TAI sequenza.CIN.breakpoints        Non.diploid.Ratio 
##              0.004690237              0.088171029              0.144648162 
##   facets.CIN.modifiedTAI           Total.segments               facets.fga 
##              0.153257988              0.172883176              0.175850910 
##   facets.CIN.breakpoints       Non.diploid.Length               wgd.cnmode 
##              0.218774978              0.222898355              0.322606685 
##         sequenza.CIN.TAI sequenza.CIN.modifiedTAI            facets.n_segs 
##              0.326808599              0.335880928              0.346437471 
##           facets.CIN.FGA           facets.CIN.CNA         facets.CIN.bases 
##              0.415419836              0.448751753              0.489950140 
##         sequenza.CIN.CNA       sequenza.CIN.bases         sequenza.CIN.FGA 
##              0.576085774              0.685353870              0.692901171 
##      facets.fraction_loh 
##              0.935274847
# CES TPM vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
  as.numeric(unlist(wilcox.test(metadata_lissa_aneuploidy$CES.tpm~x))["p.value"])))
## facets.hypoploid     wgd.absolute       facets.wgd          wgd.mcn 
##        0.1379310        0.1506983        0.2785190        0.3241689
ggplot(metadata_lissa_aneuploidy,aes(x=CES.tpm,y=facets.CIN.TAI)) + geom_point(aes(color=CES.category)) +
  geom_vline(xintercept=median(metadata_lissa_aneuploidy$CES.tpm),linetype="dashed") +
  geom_hline(yintercept=median(metadata_lissa_aneuploidy$facets.CIN.TAI),linetype="dashed") +
  ggtitle("Lissa, CES vs. FACETS TAI") + xlab("CES expression (log2TPM)") + ylab("FACETS TAI") +
  scale_color_discrete(name="CES category") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor.test(metadata_lissa_aneuploidy$CES.tpm,metadata_lissa_aneuploidy$facets.CIN.TAI,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  metadata_lissa_aneuploidy$CES.tpm and metadata_lissa_aneuploidy$facets.CIN.TAI
## S = 1966, p-value = 0.00469
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5157635

CES category vs. aneuploidy

# CES category vs. continuous metrics
sort(apply(metadata_lissa_aneuploidy[,aneuploidy.continuous],2,function(x)
  as.numeric(unlist(wilcox.test(x~metadata_lissa_aneuploidy$CES.category))["p.value"])))
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##           facets.CIN.TAI               facets.fga sequenza.CIN.breakpoints 
##              0.001081665              0.056839741              0.072543601 
##   facets.CIN.modifiedTAI           Total.segments   facets.CIN.breakpoints 
##              0.072543601              0.080201111              0.143893481 
##        Non.diploid.Ratio            facets.n_segs sequenza.CIN.modifiedTAI 
##              0.174847813              0.244925889              0.282897456 
##       Non.diploid.Length         sequenza.CIN.CNA               wgd.cnmode 
##              0.303256893              0.346724806              0.368460543 
##         sequenza.CIN.TAI           facets.CIN.FGA           facets.CIN.CNA 
##              0.418719925              0.418719925              0.471097052 
##         facets.CIN.bases       sequenza.CIN.bases         sequenza.CIN.FGA 
##              0.498539741              0.647057192              0.647057192 
##      facets.fraction_loh 
##              0.739654986
# CES category vs. discrete metrics
sort(apply(metadata_lissa_aneuploidy[,setdiff(aneuploidy.discrete,"facets.arm")],2,function(x)
  as.numeric(unlist(chisq.test(x,metadata_lissa_aneuploidy$CES.category))["p.value"])))
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(x, metadata_lissa_aneuploidy$CES.category): Chi-squared
## approximation may be incorrect
##     wgd.absolute       facets.wgd          wgd.mcn facets.hypoploid 
##        0.2799424        0.3155851        0.3155851        1.0000000
ggplot(metadata_lissa_aneuploidy,aes(x=CES.category,y=facets.CIN.TAI)) + geom_boxplot() +
  ggtitle("Lissa, CES vs. FACETS TAI") + xlab("CES category") + ylab("FACETS TAI")

Conclusion: The only aneuploidy metric significantly correlated with CENP-A expression or CES score is facets.CIN.TAI (FACETS-based Total Aberration Index). In general, aneuploidy does not appear to correlate with CENP-A expression.